{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HTML\n",
    "> The HyperText Markup Language or HTML is the standard markup language for documents designed to be displayed in a web browser.<br>\n",
    "超文本标记语言或 HTML 是设计用于在 Web 浏览器中显示的文档的标准标记语言。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='My First Heading\\n\\nMy first paragraph.', metadata={'source': '../data/fake-content.html'})]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_community.document_loaders import UnstructuredHTMLLoader\n",
    "\n",
    "loader = UnstructuredHTMLLoader(\"../data/fake-content.html\")\n",
    "loader.load()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='\\n\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n', metadata={'source': '../data/fake-content.html', 'title': 'Test Title'})]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_community.document_loaders import BSHTMLLoader\n",
    "\n",
    "loader = BSHTMLLoader(\"../data/fake-content.html\")\n",
    "loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip install --upgrade --quiet   spider-client\n",
    "from langchain_community.document_loaders import SpiderLoader\n",
    "\n",
    "loader = SpiderLoader(\n",
    "    api_key=\"YOUR_API_KEY\", url=\"https://spider.cloud\", mode=\"crawl\"\n",
    ")\n",
    "\n",
    "loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip install --upgrade --quiet   firecrawl-py\n",
    "\n",
    "from langchain_community.document_loaders import FireCrawlLoader\n",
    "\n",
    "\n",
    "loader = FireCrawlLoader(\n",
    "    api_key=\"YOUR_API_KEY\", url=\"https://firecrawl.dev\", mode=\"crawl\"\n",
    ")\n",
    "\n",
    "data = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip install --upgrade --quiet   azure-ai-documentintelligence\n",
    "\n",
    "from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader\n",
    "\n",
    "file_path = \"<filepath>\"\n",
    "endpoint = \"<endpoint>\"\n",
    "key = \"<key>\"\n",
    "loader = AzureAIDocumentIntelligenceLoader(\n",
    "    api_endpoint=endpoint, api_key=key, file_path=file_path, api_model=\"prebuilt-layout\"\n",
    ")\n",
    "\n",
    "documents = loader.load()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "langchain0_1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
